In [1]:
# Remember: library imports are ALWAYS at the top of the script, no exceptions!
import sqlite3
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from math import ceil

from itertools import product
from scipy.stats import skewnorm

from datetime import datetime
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder

# for better resolution plots
%config InlineBackend.figure_format = 'retina' # optionally, you can change 'svg' to 'retina'

####### Disable warnings #######
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

# Setting seaborn style
sns.set()

Context¶

The data we will be using through the pratical classes comes from a small relational database whose schema can be seen below: alt text

Reading the Data¶

In [2]:
# path to database
my_path = os.path.join("..", "data", "datamining.db")

# connect to the database
conn = sqlite3.connect(my_path)

# the query
query = """
select
    age, 
    income, 
    frq, 
    rcn, 
    mnt, 
    clothes, 
    kitchen, 
    small_appliances, 
    toys, 
    house_keeping,
    dependents, 
    per_net_purchase,
    g.gender, 
    e.education, 
    m.status, 
    r.description
from customers as c
    join genders as g on g.id = c.gender_id
    join education_levels as e on e.id = c.education_id
    join marital_status as m on m.id = c.marital_status_id
    join recommendations as r on r.id = c.recommendation_id
order by c.id;
"""

df = pd.read_sql_query(query, conn)

Make a copy of your original dataset¶

why?

In [3]:
df_original = df.copy()

Metadata¶

  • id - The unique identifier of the customer
  • age - The year of birht of the customer
  • income - The income of the customer
  • frq - Frequency: number of purchases made by the customer
  • rcn - Recency: number of days since last customer purchase
  • mnt - Monetary: amount of € spent by the customer in purchases
  • clothes - Number of clothes items purchased by the customer
  • kitchen - Number of kitchen items purchased by the customer
  • small_appliances - Number of small_appliances items purchased by the customer
  • toys - Number of toys items purchased by the customer
  • house_keeping - Number of house_keeping items purchased by the customer
  • dependents - Binary. Whether or not the customer has dependents
  • per_net_purchase - Percentage of purchases made online
  • education - Education level of the customer
  • status - Marital status of the customer
  • gender - Gender of the customer
  • description - Last customer's recommendation description

Problems:¶

  • Duplicates?
  • Data types?
  • Missing values?
  • Strange values?
  • Descriptive statistics?

Take a closer look and point out possible problems:¶

(hint: a missing values in pandas is represented with a NaN value)

Replicating modifications from previous notebook¶

In [4]:
# replace "" by nans
df.replace("", np.nan, inplace=True)

# count of missing values
df.isna().sum()
Out[4]:
age                   0
income               46
frq                   0
rcn                   0
mnt                   0
clothes               0
kitchen               0
small_appliances      0
toys                  0
house_keeping         0
dependents          282
per_net_purchase      0
gender                0
education            47
status              177
description           0
dtype: int64
In [5]:
# fix wrong dtypes
df.dependents = df.dependents.astype("boolean")  # converting to "boolean" over "bool" allows preservation of NaNs
In [6]:
# check dataset data types again
df.dtypes
Out[6]:
age                   int64
income              float64
frq                   int64
rcn                   int64
mnt                   int64
clothes               int64
kitchen               int64
small_appliances      int64
toys                  int64
house_keeping         int64
dependents          boolean
per_net_purchase      int64
gender               object
education            object
status               object
description          object
dtype: object
In [7]:
# check descriptive statistics again
df.describe(include="all").T
Out[7]:
count unique top freq mean std min 25% 50% 75% max
age 8998.0 NaN NaN NaN 1966.05968 17.296552 1936.0 1951.0 1966.0 1981.0 1996.0
income 8952.0 NaN NaN NaN 69963.550827 27591.556226 10000.0 47741.0 70030.5 92218.0 140628.0
frq 8998.0 NaN NaN NaN 19.848077 10.903435 3.0 10.0 17.0 28.0 59.0
rcn 8998.0 NaN NaN NaN 62.469771 69.761802 0.0 26.0 53.0 79.0 549.0
mnt 8998.0 NaN NaN NaN 622.162814 646.768205 6.0 63.0 383.0 1076.0 3052.0
clothes 8998.0 NaN NaN NaN 50.446655 23.422249 1.0 33.0 51.0 69.0 99.0
kitchen 8998.0 NaN NaN NaN 7.039675 7.848139 0.0 2.0 4.0 10.0 75.0
small_appliances 8998.0 NaN NaN NaN 28.524116 12.586437 1.0 19.0 28.0 37.0 74.0
toys 8998.0 NaN NaN NaN 7.036897 7.924422 0.0 2.0 4.0 10.0 62.0
house_keeping 8998.0 NaN NaN NaN 6.929984 7.882655 0.0 2.0 4.0 9.0 77.0
dependents 8716 2 True 6164 NaN NaN NaN NaN NaN NaN NaN
per_net_purchase 8998.0 NaN NaN NaN 42.428984 18.495742 4.0 28.0 45.0 57.0 88.0
gender 8998 2 M 5784 NaN NaN NaN NaN NaN NaN NaN
education 8951 6 Graduation 4429 NaN NaN NaN NaN NaN NaN NaN
status 8821 6 Married 3273 NaN NaN NaN NaN NaN NaN NaN
description 8998 5 OK nice! 3434 NaN NaN NaN NaN NaN NaN NaN
In [8]:
#Define metric and non-metric features. Why?
non_metric_features = ["education", "status", "gender", "dependents", "description"]
metric_features = df.columns.drop(non_metric_features).to_list()

Data Preprocessing¶

Fill missing values¶

https://statisticsbyjim.com/basics/missing-data/



Read: Section 2.4.2

Han, J., Pei, J., & Tong, H. (2022). Data mining: Concepts and Techniques (4th ed.). Morgan Kaufmann.


Read: Chapter 4

García, S., Luengo, J., & Herrera, F. (2014). Data preprocessing in data mining. Springer.

How can we fill missing values?¶

Measures of central tendency¶

In [9]:
# Creating a copy to apply central tendency measures imputation
df_central = df.copy()
In [10]:
# count of missing values
df_central.isna().sum()
Out[10]:
age                   0
income               46
frq                   0
rcn                   0
mnt                   0
clothes               0
kitchen               0
small_appliances      0
toys                  0
house_keeping         0
dependents          282
per_net_purchase      0
gender                0
education            47
status              177
description           0
dtype: int64
In [11]:
means = df_central[metric_features].mean()
means
Out[11]:
age                  1966.059680
income              69963.550827
frq                    19.848077
rcn                    62.469771
mnt                   622.162814
clothes                50.446655
kitchen                 7.039675
small_appliances       28.524116
toys                    7.036897
house_keeping           6.929984
per_net_purchase       42.428984
dtype: float64
In [12]:
# df_central.median(numeric_only=True)

# Instead of saying numeric_only = True, 
# you can explicitly give a df filtered to only the numeric features

# [Alternative]
medians = df_central[metric_features].median()
medians
Out[12]:
age                  1966.0
income              70030.5
frq                    17.0
rcn                    53.0
mnt                   383.0
clothes                51.0
kitchen                 4.0
small_appliances       28.0
toys                    4.0
house_keeping           4.0
per_net_purchase       45.0
dtype: float64

Mean or Median?¶

In [13]:
#####################
## You don't need to understand this code
#####################


skew_demo = np.round(skewnorm.rvs(180, size=1500, random_state=68410237)*100, 0).astype(int)

skew_md = np.median(skew_demo)
skew_mn = np.mean(skew_demo)

sns.set_theme(style="white", palette=None)
fig, ax = plt.subplots(figsize=(8,5))

sns.histplot(skew_demo, bins=20,
             ax=ax, color='tab:orange')

ax.axvline(skew_md, color='black', linewidth=4,
           label='Median: {}'.format(int(skew_md)))
ax.axvline(skew_mn, color='blue', linestyle='dashed', linewidth=4,
           label='Mean: {}'.format(int(skew_mn)))

ax.legend(handlelength=5)
ax.set_title("Histogram")
plt.show()

sns.set_theme(style="white")
No description has been provided for this image

The mean is the average of all values. The median is the middle value when a data set is ordered from least to greatest. If a data set has an odd number of values, the median is the middle value. If a data set has an even number of values, the median is the average of the two middle values.


Não ouvi o pq, mas ela escolheu a mediana.

What about non-numeric features?¶

In [14]:
# Check each step of these commands
modes = df_central[non_metric_features].mode().loc[0]
modes
Out[14]:
education      Graduation
status            Married
gender                  M
dependents           True
description      OK nice!
Name: 0, dtype: object
In [ ]:
 
In [15]:
# Fill NaNs on df_central
df_central[metric_features] = df_central[metric_features].fillna(medians)
df_central[non_metric_features] = df_central[non_metric_features].fillna(modes)

df_central.isna().sum()  # checking how many NaNs we still have
Out[15]:
age                 0
income              0
frq                 0
rcn                 0
mnt                 0
clothes             0
kitchen             0
small_appliances    0
toys                0
house_keeping       0
dependents          0
per_net_purchase    0
gender              0
education           0
status              0
description         0
dtype: int64

KNN¶

In [16]:
# Creating new df copy to explore neighbordhood imputation
df_neighbors = df.copy()
In [17]:
# Seeing rows with NaNs
nans_index = df_neighbors.isna().any(axis=1)
df_neighbors[nans_index]
Out[17]:
age income frq rcn mnt clothes kitchen small_appliances toys house_keeping dependents per_net_purchase gender education status description
3 1955 93571.0 26 10 888 60 10 19 6 5 True 35 F Master NaN OK nice!
61 1968 NaN 14 90 184 95 1 3 1 0 True 59 F 2nd Cycle Married Meh...
67 1981 60457.0 9 73 63 37 2 51 6 3 <NA> 57 M PhD Married Meh...
70 1944 116259.0 28 35 1279 17 6 54 3 20 False 11 M Graduation NaN OK nice!
73 1967 75274.0 16 67 263 88 3 6 1 2 <NA> 66 M Graduation Single Kind of OK
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
8932 1959 NaN 20 71 716 67 5 25 3 1 False 27 M Graduation Married Meh...
8942 1950 92746.0 34 98 1399 50 5 36 5 4 <NA> 28 M Master Married Take my money!!
8964 1978 44661.0 6 49 33 18 18 35 16 13 True 42 M 1st Cycle NaN OK nice!
8972 1976 NaN 17 9 254 70 1 28 1 1 True 59 M Graduation Divorced Meh...
8992 1954 87399.0 25 1 837 56 8 27 8 1 <NA> 47 M Graduation Married Kind of OK

542 rows × 16 columns

In [18]:
# KNNImputer - only works for numerical variables. Fill NaNs on df_neighbors
imputer = KNNImputer(n_neighbors=5, weights="uniform")
df_neighbors[metric_features] = imputer.fit_transform(df_neighbors[metric_features])
In [19]:
# See rows with NaNs imputed
df_neighbors.loc[nans_index, metric_features]

# [Alternative]
# df_neighbors[nans_index]
Out[19]:
age income frq rcn mnt clothes kitchen small_appliances toys house_keeping per_net_purchase
3 1955.0 93571.0 26.0 10.0 888.0 60.0 10.0 19.0 6.0 5.0 35.0
61 1968.0 63143.0 14.0 90.0 184.0 95.0 1.0 3.0 1.0 0.0 59.0
67 1981.0 60457.0 9.0 73.0 63.0 37.0 2.0 51.0 6.0 3.0 57.0
70 1944.0 116259.0 28.0 35.0 1279.0 17.0 6.0 54.0 3.0 20.0 11.0
73 1967.0 75274.0 16.0 67.0 263.0 88.0 3.0 6.0 1.0 2.0 66.0
... ... ... ... ... ... ... ... ... ... ... ...
8932 1959.0 85643.4 20.0 71.0 716.0 67.0 5.0 25.0 3.0 1.0 27.0
8942 1950.0 92746.0 34.0 98.0 1399.0 50.0 5.0 36.0 5.0 4.0 28.0
8964 1978.0 44661.0 6.0 49.0 33.0 18.0 18.0 35.0 16.0 13.0 42.0
8972 1976.0 63932.4 17.0 9.0 254.0 70.0 1.0 28.0 1.0 1.0 59.0
8992 1954.0 87399.0 25.0 1.0 837.0 56.0 8.0 27.0 8.0 1.0 47.0

542 rows × 11 columns

Keep central tendency imputation¶

Para a 2º observação:
KNN     = 63k
Mean    = 70k
Median  = 70k

Why?

Porque a mediana é mais robusta a outliers e o KNN está mau pq não teve MinMaxScaler.

In [20]:
# let's keep the central imputation
df = df_central.copy()
In [21]:
df_central.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8998 entries, 0 to 8997
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               8998 non-null   int64  
 1   income            8998 non-null   float64
 2   frq               8998 non-null   int64  
 3   rcn               8998 non-null   int64  
 4   mnt               8998 non-null   int64  
 5   clothes           8998 non-null   int64  
 6   kitchen           8998 non-null   int64  
 7   small_appliances  8998 non-null   int64  
 8   toys              8998 non-null   int64  
 9   house_keeping     8998 non-null   int64  
 10  dependents        8998 non-null   boolean
 11  per_net_purchase  8998 non-null   int64  
 12  gender            8998 non-null   object 
 13  education         8998 non-null   object 
 14  status            8998 non-null   object 
 15  description       8998 non-null   object 
dtypes: boolean(1), float64(1), int64(10), object(4)
memory usage: 1.0+ MB

An overview of our previous data exploration¶

You can also explore this dataset using the exported pandas-profiling report.

Outlier removal¶

Why do we need to remove outliers? Which methods can we use?

Outlier removal using only the IQR method¶

Why should you use/not use this method?

Inter-Quartile Range formula:


$IQR = Q_3 – Q_1$




Outliers are the observations that fall

  • below $Q_1 − 1.5 \times IQR$

or

  • above $Q_3 + 1.5 \times IQR$


https://online.stat.psu.edu/stat200/lesson/3/3.2

In [22]:
# Compute the IQR
# HINT: Use .quantile()

# Calculate Q1, Q3, IQR
q25 = q1= df[metric_features].quantile(.25)
q75 = q3 = df[metric_features].quantile(.75)
iqr = (q75 - q25)

# Compute upper and lower limit 
# lower_limit = Q1 -1.5*IQR
upper_lim = q75 + 1.5 * iqr

# upper_limit = Q3 + 1.5*IQR
lower_lim = q25 - 1.5 * iqr
In [23]:
filters_iqr = []
for metric in metric_features:
    llim = lower_lim[metric]
    ulim = upper_lim[metric]
    filters_iqr.append(df[metric].between(llim, ulim, inclusive='neither'))

filters_iqr_all = pd.Series(np.all(filters_iqr, 0))
In [24]:
filters_iqr_all
Out[24]:
0        True
1        True
2       False
3        True
4        True
        ...  
8993     True
8994     True
8995     True
8996     True
8997    False
Length: 8998, dtype: bool
In [23]:
# filters_iqr
# filters_iqr_all
In [24]:
filters_iqr_all.sum()
Out[24]:
7339
In [25]:
df_iqr = df[filters_iqr_all]
print('Percentage of data kept after removing outliers:', 100*(np.round(df_iqr.shape[0] / df_original.shape[0], 4)))
Percentage of data kept after removing outliers: 81.56

What do you think of this number?

$20\%$ of the data is considered an outlier $\rightarrow$ Para a professora, é um valor muito alto. (No máximo $5\%$)

In general we want to keep as much of our data as possible.

As a rule of thumb, try not to remove more than 5% of your rows.

*This is only a rule of thumb!*

In some cases it really is necessary to remove more.

Manual outlier removal¶

Now let's try "manually" filtering the dataset's outliers


Let's look at the boxplots again:

In [26]:
# This may vary from session to session, and is prone to varying interpretations.
# A simple example is provided below:

# ( df['feature1']<= limit1 ) & ( df['feature2']<= limit2 ) ...


filters_man = (
    #   ( df['feature1']<= limit1 ) 
    # & ( df['feature2']<= limit2 )
    # ...
    (df['house_keeping']<=50)
    &
    (df['kitchen']<=40)
    &
    (df['toys']<=35)
    &
    (df['education']!='OldSchool')
)

df_man = df[filters_man]
In [27]:
print('Percentage of data kept after removing outliers:', 100*np.round(df_man.shape[0] / df_original.shape[0], 4))
Percentage of data kept after removing outliers: 97.99
Depende do domínio, mas a professora escolheu remover os outliers.

Combining different outlier methods¶

More robust/ consistent outlier detection method:

In [28]:
## What is this doing?
df_out = df[(filters_iqr_all | filters_man)] 

print('Percentage of data kept after removing outliers:', np.round(df_out.shape[0] / df_original.shape[0], 4))
Percentage of data kept after removing outliers: 0.9819
In [29]:
# Get the manual filtering version
df = df_man.copy()

Outlier threshold value¶

You may change the values of observations for a given variable:

In [30]:
# Remember the 'rcn' variable had an odd behavior? 
# This is the time to fix that

df['birth_year'] = df['age']
df['age'] = datetime.now().year - df['birth_year']

df['spent_online'] = (df['per_net_purchase'] / 100) * df['mnt']
In [31]:
print((df['rcn']>100).value_counts())

rcn_t = df['rcn'].copy()
rcn_t.loc[rcn_t>100] = 100

df['rcn'] = rcn_t
rcn
False    8437
True      380
Name: count, dtype: int64

Exercise¶

Visualize your boxplots and histograms before and after outlier removal.

In [32]:
# Boxplot before and after
fig, ax = plt.subplots(1, 2, figsize=(12, 6))

sns.boxplot(data=df_original[['rcn']], ax=ax[0])
ax[0].set_title('Before')

sns.boxplot(data=df[['rcn']], ax=ax[1])
ax[1].set_title('After')

plt.show()
No description has been provided for this image
In [33]:
sns.set_style('whitegrid')

frows = 2
fcols = ceil(len(metric_features)/frows)

fig = plt.figure(figsize=(20,8))
subfigs = fig.subfigures(frows, fcols, wspace=.03, hspace=.03)

for sfig, feat in zip(subfigs.flatten(), metric_features):
    axes = sfig.subplots(2, 1, sharex=True)
    
    sns.boxplot(x=df_original[feat], ax=axes[0])
    axes[0].set_ylabel('Original')
    axes[0].set_title(feat, fontsize='large')
    
    
    sns.boxplot(x=df[feat], ax=axes[1])
    axes[1].set_ylabel('Outliers\nRemoved')
    axes[1].set_xlabel('')

    # Subfigure Aesthetics
    sfig.set_facecolor("#F9F9F9")
    sfig.subplots_adjust(left=.2, right=.95, bottom=.1,)

    
plt.savefig(os.path.join('..', 'figures', 'eda', 'boxplots_before_after_outliers.png'), dpi=200)
plt.show()


sns.set()
No description has been provided for this image

Feature Engineering¶

A reminder of our metadata:

  • id - The unique identifier of the customer
  • age - The year of birht of the customer
  • income - The income of the customer
  • frq - Frequency: number of purchases made by the customer
  • rcn - Recency: number of days since last customer purchase
  • mnt - Monetary: amount of € spent by the customer in purchases
  • clothes - Number of clothes items purchased by the customer
  • kitchen - Number of kitchen items purchased by the customer
  • small_appliances - Number of small_appliances items purchased by the customer
  • toys - Number of toys items purchased by the customer
  • house_keeping - Number of house_keeping items purchased by the customer
  • dependents - Binary. Whether or not the customer has dependents
  • per_net_purchase - Percentage of purchases made online
  • education - Education level of the customer
  • status - Marital status of the customer
  • gender - Gender of the customer
  • description - Last customer's recommendation description
In [34]:
# Try to pull as many features as you can; You can always drop them later on
# You can start by converting the 'age' variable to the actual age
# You can also store the birth year in another feature

new_features = []
df['birth_year'] = df['age']
df['age'] = 2024 - df['birth_year']
new_features.append('birth_year')
new_features
Out[34]:
['birth_year']
In [35]:
df['spent_online'] = df['per_net_purchase']*df['mnt']/100
new_features.append('spent_online')
new_features
Out[35]:
['birth_year', 'spent_online']
In [36]:
df.describe()
Out[36]:
age income frq rcn mnt clothes kitchen small_appliances toys house_keeping per_net_purchase birth_year spent_online
count 8817.000000 8817.000000 8817.000000 8817.000000 8817.000000 8817.000000 8817.000000 8817.000000 8817.000000 8817.000000 8817.000000 8817.000000 8817.000000
mean 1965.664739 70534.387887 19.990360 52.220937 630.162640 51.181127 6.776908 28.673925 6.581263 6.764319 42.241239 58.335261 178.378072
std 17.075958 27246.776600 10.918369 30.098895 647.834648 22.984454 7.310850 12.595621 6.895266 7.540019 18.482272 17.075958 168.577960
min 1936.000000 10000.000000 3.000000 0.000000 6.000000 1.000000 0.000000 1.000000 0.000000 0.000000 4.000000 28.000000 2.730000
25% 1951.000000 48842.000000 10.000000 26.000000 64.000000 34.000000 2.000000 19.000000 2.000000 2.000000 27.000000 44.000000 34.650000
50% 1966.000000 70388.000000 18.000000 53.000000 387.000000 52.000000 4.000000 28.000000 4.000000 4.000000 45.000000 58.000000 140.000000
75% 1980.000000 92413.000000 28.000000 78.000000 1079.000000 70.000000 9.000000 37.000000 9.000000 9.000000 57.000000 73.000000 265.100000
max 1996.000000 140628.000000 59.000000 100.000000 3052.000000 99.000000 40.000000 74.000000 35.000000 50.000000 88.000000 88.000000 1416.240000

A aula (7/11/2024) só começa AQUI! :)

Please download Lab 7 :)¶

Start 10:05

Variable selection: Redundancy VS Relevancy¶

Redundancy: Handling highly correlated variables¶

We already saw our original correlation matrix:

  • Depois de criarmos as novas variáveis, vamos verificar a correlação entre elas e verficar se temos variáveis redundantes.

    • income e age | mnt e frq são variáveis redundantes
    • Pq q removemos? Pq vamos dar mais peso à mesma informação
    • Qual vamos remover? EDA é importante para decidir
      • (p.e., quando analisamos os boxplots, podemos ver que a mnt e frq e ver se têm outliers -> frq melhor/ quando analisamos os histogramas, podemos procurar por distribuições normais -> Neste caso frq é ligeiramente mais normal que mnt)
      • (p.e. quando vemos a age e income podemos ver que a income tem uma distribuição mais normal que a age, mas income tem missing values e ambos não têm outliers -> Neste caso tmb podemos pensar qual será a variável mais fácil de obter corretamente dos dados para o negócio -> Ela optou por remover a age)
      • Também podemos pensar qual será a mais fácil de explicar no cluster -> age é mais fácil de explicar que a income -> Umas são mais fáceis de ser para serem INPUT do clustering e outras para PROFILE
  • Quando fazemos a Feature Engineering temos de fazer a matriz de correlação again para ver se temos novas variáveis redundantes

In [37]:
# Select variables according to their correlations
# Updating metric_features

## RUN THIS CELL ONLY ONCE!!!!

# CODE HERE
metric_features.append("spent_online")
metric_features.remove("mnt")
metric_features.remove("age")

metric_features
Out[37]:
['income',
 'frq',
 'rcn',
 'clothes',
 'kitchen',
 'small_appliances',
 'toys',
 'house_keeping',
 'per_net_purchase',
 'spent_online']
In [38]:
# You can also drop the features from the df
# But sometimes they may be useful for cluster profiling later

# df.drop(..., inplace=True)

Relevancy¶

Selecting variables based on the relevancy of each one to the task.

Example:

  • remove uncorrelated variables with the target,
  • stepwise regression,
  • use variables for product clustering,
  • use variables for socio-demographic clustering,
  • ...

Variables that aren't correlated with any other variable are often also not relevant. In this case we will not focus on this a lot since we don't have a defined task yet.

Redo data exploration¶

Check if the data looks the way you expect it to.

  • Have you missed some outliers?
  • Are there still missing values?
  • Is the data normalized?

This is an iterative process. It is likely you will change your preprocessing steps frequently throughout your group work.

Data Normalization¶

In [39]:
# Remember our original data
df.describe().round(2)
Out[39]:
age income frq rcn mnt clothes kitchen small_appliances toys house_keeping per_net_purchase birth_year spent_online
count 8817.00 8817.00 8817.00 8817.00 8817.00 8817.00 8817.00 8817.00 8817.00 8817.00 8817.00 8817.00 8817.00
mean 1965.66 70534.39 19.99 52.22 630.16 51.18 6.78 28.67 6.58 6.76 42.24 58.34 178.38
std 17.08 27246.78 10.92 30.10 647.83 22.98 7.31 12.60 6.90 7.54 18.48 17.08 168.58
min 1936.00 10000.00 3.00 0.00 6.00 1.00 0.00 1.00 0.00 0.00 4.00 28.00 2.73
25% 1951.00 48842.00 10.00 26.00 64.00 34.00 2.00 19.00 2.00 2.00 27.00 44.00 34.65
50% 1966.00 70388.00 18.00 53.00 387.00 52.00 4.00 28.00 4.00 4.00 45.00 58.00 140.00
75% 1980.00 92413.00 28.00 78.00 1079.00 70.00 9.00 37.00 9.00 9.00 57.00 73.00 265.10
max 1996.00 140628.00 59.00 100.00 3052.00 99.00 40.00 74.00 35.00 50.00 88.00 88.00 1416.24

MinMax Scaling¶

Transforms values to be between [0,1]

$$ x' = \frac{x - min(X)}{max(X) - min(X)}$$

where:

  • $x$ is an original value
  • $x'$ is the normalized value

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html

In [40]:
## Let's look at an example:
demo_ages = pd.DataFrame({"Age": [14, 24, 22, 18, 18, 26, 11, 23, 13, 12]})
demo_ages
Out[40]:
Age
0 14
1 24
2 22
3 18
4 18
5 26
6 11
7 23
8 13
9 12
In [41]:
# How to do this with Python/Pandas?
#  ( demo_ages - min(demo_ages) ) 
# --------------------------------- 
#  max(demo_ages) - min(demo_ages)

demo_ages
Out[41]:
Age
0 14
1 24
2 22
3 18
4 18
5 26
6 11
7 23
8 13
9 12
In [42]:
min_age = demo_ages["Age"].min()
max_age = demo_ages["Age"].max()

mm_age = (demo_ages["Age"]-min_age )/(max_age-min_age)
pd.DataFrame(mm_age)
Out[42]:
Age
0 0.200000
1 0.866667
2 0.733333
3 0.466667
4 0.466667
5 1.000000
6 0.000000
7 0.800000
8 0.133333
9 0.066667

Remember the formula:

$$x' = \frac{x - min(X)}{max(X) - min(X)}$$

where:

  • $x$ is an original value
  • $x'$ is the normalized value

(This is not showing all rows)

Age (Orig) Formula Result
14 (14 - 11) / 15 0.2
24 (24 - 11) / 15 0.86666667
26 (26 - 11) / 15 1
11 (11 - 11) / 15 0
In [43]:
# sklearn makes it even easier:
demo_ages["Age_minmax"] = MinMaxScaler().fit_transform(demo_ages[["Age"]])
demo_ages
Out[43]:
Age Age_minmax
0 14 0.200000
1 24 0.866667
2 22 0.733333
3 18 0.466667
4 18 0.466667
5 26 1.000000
6 11 0.000000
7 23 0.800000
8 13 0.133333
9 12 0.066667

Now do it for our dataset¶

In [44]:
df_minmax = df.copy()
In [45]:
# Use MinMaxScaler to scale the data
mm_scaler = MinMaxScaler()
mm_scaled_feat = mm_scaler.fit_transform(df[metric_features])
mm_scaled_feat
Out[45]:
array([[0.61841259, 0.53571429, 0.66      , ..., 0.06      , 0.17857143,
        0.1865215 ],
       [0.78867471, 0.51785714, 0.06      , ..., 0.04      , 0.05952381,
        0.0959314 ],
       [0.14042931, 0.14285714, 0.69      , ..., 0.48      , 0.6547619 ,
        0.01643427],
       ...,
       [0.36838197, 0.16071429, 0.06      , ..., 0.12      , 0.79761905,
        0.02870868],
       [0.33906972, 0.46428571, 0.72      , ..., 0.22      , 0.32142857,
        0.21979328],
       [0.88772698, 0.625     , 0.75      , ..., 0.12      , 0.48809524,
        0.52908717]])
In [46]:
# See what the fit method is doing (notice the trailing underscore):
print("Parameters fitted:\n")
pd.DataFrame(
    [mm_scaler.data_min_, mm_scaler.data_max_], 
    columns=metric_features, 
    index=['min','max'])
Parameters fitted:

Out[46]:
income frq rcn clothes kitchen small_appliances toys house_keeping per_net_purchase spent_online
min 10000.0 3.0 0.0 1.0 0.0 1.0 0.0 0.0 4.0 2.73
max 140628.0 59.0 100.0 99.0 40.0 74.0 35.0 50.0 88.0 1416.24
In [47]:
## Replace original metric_features values with mm_scaled_feat values
df_minmax[metric_features] = mm_scaled_feat
In [48]:
# Checking max and min of minmaxed variables
df_minmax[metric_features].describe().round(2)
Out[48]:
income frq rcn clothes kitchen small_appliances toys house_keeping per_net_purchase spent_online
count 8817.00 8817.00 8817.00 8817.00 8817.00 8817.00 8817.00 8817.00 8817.00 8817.00
mean 0.46 0.30 0.52 0.51 0.17 0.38 0.19 0.14 0.46 0.12
std 0.21 0.19 0.30 0.23 0.18 0.17 0.20 0.15 0.22 0.12
min 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
25% 0.30 0.12 0.26 0.34 0.05 0.25 0.06 0.04 0.27 0.02
50% 0.46 0.27 0.53 0.52 0.10 0.37 0.11 0.08 0.49 0.10
75% 0.63 0.45 0.78 0.70 0.22 0.49 0.26 0.18 0.63 0.19
max 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00
In [49]:
df_minmax[metric_features].head()
Out[49]:
income frq rcn clothes kitchen small_appliances toys house_keeping per_net_purchase spent_online
0 0.618413 0.535714 0.66 0.367347 0.125 0.589041 0.285714 0.06 0.178571 0.186521
1 0.788675 0.517857 0.06 0.551020 0.025 0.506849 0.114286 0.04 0.059524 0.095931
2 0.140429 0.142857 0.69 0.316327 0.475 0.315068 0.028571 0.48 0.654762 0.016434
3 0.639763 0.410714 0.10 0.602041 0.250 0.246575 0.171429 0.10 0.369048 0.217947
4 0.626604 0.500000 0.26 0.591837 0.125 0.369863 0.114286 0.08 0.357143 0.271799
In [50]:
df[metric_features].head()
Out[50]:
income frq rcn clothes kitchen small_appliances toys house_keeping per_net_purchase spent_online
0 90782.0 33 66 37 5 44 10 3 19 266.38
1 113023.0 32 6 55 1 38 4 2 9 138.33
2 28344.0 11 69 32 19 24 1 24 59 25.96
3 93571.0 26 10 60 10 19 6 5 35 310.80
4 91852.0 31 26 59 5 28 4 4 34 386.92

Standard Scaling (AKA Z-Score Scaling)¶

Standardize features by removing the mean and scaling to unit variance.

The standard score of a sample x is calculated as:

$$z = (x - u) / s$$

where:

  • $u$ is the mean of the training samples,

  • $s$ is the standard deviation of the training samples

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

In [51]:
## Let's see the age demo again
demo_ages
Out[51]:
Age Age_minmax
0 14 0.200000
1 24 0.866667
2 22 0.733333
3 18 0.466667
4 18 0.466667
5 26 1.000000
6 11 0.000000
7 23 0.800000
8 13 0.133333
9 12 0.066667
$$z = (x - u) / s$$

where:

  • $u$ is the mean of the training samples,
  • $s$ is the standard deviation of the training samples
In [52]:
## u = mean of age
## s = std of age

# How to do this in Python/Pandas?
# ( demo_ages - mean(demo_ages) )
# -------------------------------
#        std(demo_ages)
In [53]:
## u = mean of age
## s = std of age

mean_age = demo_ages["Age"].mean()
std_age  = demo_ages["Age"].std(ddof = 0) # to match sklearn implementation of std

#      ddof : int, default 1
#             Delta degrees of freedom.  The divisor used in calculations
#             is ``N - ddof``, where ``N`` represents the number of elements.
In [54]:
pd.DataFrame((demo_ages["Age"] - mean_age) / std_age)
Out[54]:
Age
0 -0.793614
1 1.142031
2 0.754902
3 -0.019356
4 -0.019356
5 1.529160
6 -1.374308
7 0.948466
8 -0.987179
9 -1.180743
In [55]:
# Or use sklearn:
demo_ages["Age_standard"] = StandardScaler().fit_transform(demo_ages[["Age"]])
demo_ages
Out[55]:
Age Age_minmax Age_standard
0 14 0.200000 -0.793614
1 24 0.866667 1.142031
2 22 0.733333 0.754902
3 18 0.466667 -0.019356
4 18 0.466667 -0.019356
5 26 1.000000 1.529160
6 11 0.000000 -1.374308
7 23 0.800000 0.948466
8 13 0.133333 -0.987179
9 12 0.066667 -1.180743
In [56]:
demo_ages.describe().round(2)
Out[56]:
Age Age_minmax Age_standard
count 10.00 10.00 10.00
mean 18.10 0.47 -0.00
std 5.45 0.36 1.05
min 11.00 0.00 -1.37
25% 13.25 0.15 -0.94
50% 18.00 0.47 -0.02
75% 22.75 0.78 0.90
max 26.00 1.00 1.53

Now do it for our dataset¶

In [57]:
df_standard = df.copy()
In [58]:
ss_scaler = StandardScaler() # finish this line
ss_scaled_feat = ss_scaler.fit_transform(df[metric_features]) # finish this line
ss_scaled_feat
Out[58]:
array([[ 0.74316169,  1.1916046 ,  0.45781896, ..., -0.49927359,
        -1.25755953,  0.52205471],
       [ 1.55948802,  1.10001063, -1.53572277, ..., -0.63190677,
        -1.79864925, -0.23757758],
       [-1.5485421 , -0.82346271,  0.55749604, ...,  2.28602317,
         0.90679936, -0.90419125],
       ...,
       [-0.4556169 , -0.73186874, -1.53572277, ..., -0.10137405,
         1.55610702, -0.80126567],
       [-0.59615525,  0.82522873,  0.65717313, ...,  0.56179185,
        -0.60825186,  0.80105125],
       [ 2.03439682,  1.64957444,  0.75685022, ..., -0.10137405,
         0.14927375,  3.39459784]])
In [59]:
# See what the fit method is doing (notice the trailing underscore):
print("Parameters fitted:\n")
pd.DataFrame([ss_scaler.mean_, np.sqrt(ss_scaler.var_)], columns=metric_features, index=['mean','std'])
Parameters fitted:

Out[59]:
income frq rcn clothes kitchen small_appliances toys house_keeping per_net_purchase spent_online
mean 70534.387887 19.990360 52.220937 51.181127 6.776908 28.673925 6.581263 6.764319 42.241239 178.378072
std 27245.231428 10.917749 30.097188 22.983151 7.310435 12.594907 6.894875 7.539592 18.481223 168.568400
In [60]:
df_standard[metric_features] = ss_scaled_feat
df_standard.head()
Out[60]:
age income frq rcn mnt clothes kitchen small_appliances toys house_keeping dependents per_net_purchase gender education status description birth_year spent_online
0 1946 0.743162 1.191605 0.457819 1402 -0.617023 -0.243065 1.216847 0.495837 -0.499274 False -1.257560 M Graduation Together Take my money!! 78 0.522055
1 1936 1.559488 1.100011 -1.535723 1537 0.166160 -0.790228 0.740464 -0.374374 -0.631907 False -1.798649 F PhD Divorced Take my money!! 88 -0.237578
2 1990 -1.548542 -0.823463 0.557496 44 -0.834573 1.672006 -0.371096 -0.809480 2.286023 True 0.906799 M Graduation Married Kind of OK 34 -0.904191
3 1955 0.845528 0.550447 -1.402820 888 0.383710 0.440889 -0.768082 -0.084304 -0.234007 True -0.391816 F Master Married OK nice! 69 0.785568
4 1955 0.782435 1.008417 -0.871209 1138 0.340200 -0.243065 -0.053508 -0.374374 -0.366640 True -0.445925 F Graduation Together Take my money!! 69 1.237135
In [61]:
# Checking mean and variance of standardized variables
df_standard[metric_features].describe().round(2)
Out[61]:
income frq rcn clothes kitchen small_appliances toys house_keeping per_net_purchase spent_online
count 8817.00 8817.00 8817.00 8817.00 8817.00 8817.00 8817.00 8817.00 8817.00 8817.00
mean -0.00 -0.00 0.00 -0.00 -0.00 -0.00 0.00 0.00 -0.00 -0.00
std 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00
min -2.22 -1.56 -1.74 -2.18 -0.93 -2.20 -0.95 -0.90 -2.07 -1.04
25% -0.80 -0.92 -0.87 -0.75 -0.65 -0.77 -0.66 -0.63 -0.82 -0.85
50% -0.01 -0.18 0.03 0.04 -0.38 -0.05 -0.37 -0.37 0.15 -0.23
75% 0.80 0.73 0.86 0.82 0.30 0.66 0.35 0.30 0.80 0.51
max 2.57 3.57 1.59 2.08 4.54 3.60 4.12 5.73 2.48 7.34

Important: What if we had a training and test set? Should we fit a Scaler in both? What about other Sklearn objects?

Yes, we should fit the Scaler in the training set and transform both the training and test set.

Compare MinMaxScaler vs StandardScaler vs Original data¶

Boxplots¶

In [62]:
sns.set_style('whitegrid')
fig, axes = plt.subplots(2,3, figsize=(8,5), tight_layout=True, sharex='col', sharey='row')

bp_feat_l = 'house_keeping'

sns.boxplot(df_original, x=bp_feat_l, ax=axes[0][0], width=.4)
axes[0][0].set_title('Original')
axes[0][0].set_ylabel(bp_feat_l)

sns.boxplot(df_minmax, x=bp_feat_l, ax=axes[0][1], width=.4)
axes[0][1].set_title('MinMaxScaler()')

sns.boxplot(df_standard, x=bp_feat_l, ax=axes[0][2], width=.4)
axes[0][2].set_title('StandardScaler()')



bp_feat_r = 'income'

sns.boxplot(df_original, x=bp_feat_r, ax=axes[1][0], width=.4)
axes[1][0].set_ylabel(bp_feat_r)

sns.boxplot(df_minmax, x=bp_feat_r, ax=axes[1][1], width=.4)

sns.boxplot(df_standard, x=bp_feat_r, ax=axes[1][2], width=.4)


axes[1][0].set_xlabel(None)
axes[1][1].set_xlabel(None)
axes[1][2].set_xlabel(None)

fig.suptitle('Boxplots: "{}" and "{}"'.format(bp_feat_l, bp_feat_r))
plt.show()
No description has been provided for this image

Neste caso deviamos usar o StandardScaler pq apesar de termos outliers, a StandardScaler mantém simétrica a distribuição dos dados.

Os outliers são mantidos, mas dependedo do scaler, a distribuição é diferente.

In [63]:
sns.set_style('white')

fig, axes = plt.subplots(3,1, figsize=(8,5), tight_layout=True,)

sns.boxplot(df_standard, x=bp_feat_l, ax=axes[0], width=.4)
sns.boxplot(df_minmax, x=bp_feat_l, ax=axes[1], width=.4)
sns.boxplot(df, x=bp_feat_l, ax=axes[2], width=.4)

axes[0].set_title("standard")
axes[1].set_title("mm")
axes[2].set_title("origi")

plt.show()
No description has been provided for this image

Histograms¶

What about histograms?

Exercise: Visualize histograms here, compare the shapes of the original distribution with the MinMaxScaler and StandardScaler results.

In [64]:
sns.set_style('white')

# Create subplots with 3 rows, 1 column:
fig, axes = plt.subplots(3,1, 
                         figsize=(5,5), tight_layout=True)

# Which feature do you want to visualize?
hp_feat = 'house_keeping'

# common function args for histplot
hp_args = dict(x=hp_feat, bins=15)


sns.histplot(df_original, ax=axes[0], **hp_args)
axes[0].set_title('{}: Original'.format(hp_feat))
axes[0].set_xlabel(None)

sns.histplot(df_minmax, ax=axes[1], **hp_args)
axes[1].set_title('{}: MinMaxScaler()'.format(hp_feat))
axes[1].set_xlabel(None)

sns.histplot(df_standard, ax=axes[2], **hp_args)
axes[2].set_title('{}: StandardScaler()'.format(hp_feat))
axes[2].set_xlabel(None)


plt.show()

sns.set()
No description has been provided for this image

Não põe magicamente a distribuição normal, mas muda a escala dos dados.

In [65]:
# sns.set()

Keep StandardScaler¶

In [66]:
df = df_standard.copy()

One-Hot Encoding¶

In [67]:
df_ohc = df.copy()
In [68]:
# First let's remove status=Whatever
df_ohc.loc[df_ohc['status'] == 'Whatever', 'status'] = df['status'].mode()[0]
In [69]:
# Use OneHotEncoder to encode the categorical features. Get feature names and create a DataFrame 
# with the one-hot encoded categorical features (pass feature names)
ohc = OneHotEncoder(sparse_output=False, drop="first")
ohc_feat = ohc.fit_transform(df_ohc[non_metric_features])
ohc_feat_names = ohc.get_feature_names_out()
In [70]:
# Get feature names and create a DataFrame 
# with the one-hot encoded categorical features (pass feature names)
ohc_feat = ohc.fit_transform(df_ohc[non_metric_features])
ohc_feat_names = ohc.get_feature_names_out()
In [71]:
ohc_df = pd.DataFrame(ohc_feat, index=df_ohc.index, columns=ohc_feat_names)  # Why the index=df_ohc.index?
ohc_df
Out[71]:
education_2nd Cycle education_Graduation education_Master education_PhD status_Married status_Single status_Together status_Widow gender_M dependents_1.0 description_Kind of OK description_Meh... description_OK nice! description_Take my money!!
0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0
1 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
2 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 1.0 1.0 0.0 0.0 0.0
3 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0
4 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
8992 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 1.0 1.0 0.0 0.0 0.0
8993 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
8994 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 0.0
8995 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0
8996 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0

8817 rows × 14 columns

In [72]:
# Reassigning df to contain ohc variables
df_ohc = pd.concat([df_ohc, ohc_df], axis=1)
df_ohc.head()
Out[72]:
age income frq rcn mnt clothes kitchen small_appliances toys house_keeping ... status_Married status_Single status_Together status_Widow gender_M dependents_1.0 description_Kind of OK description_Meh... description_OK nice! description_Take my money!!
0 1946 0.743162 1.191605 0.457819 1402 -0.617023 -0.243065 1.216847 0.495837 -0.499274 ... 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0
1 1936 1.559488 1.100011 -1.535723 1537 0.166160 -0.790228 0.740464 -0.374374 -0.631907 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
2 1990 -1.548542 -0.823463 0.557496 44 -0.834573 1.672006 -0.371096 -0.809480 2.286023 ... 1.0 0.0 0.0 0.0 1.0 1.0 1.0 0.0 0.0 0.0
3 1955 0.845528 0.550447 -1.402820 888 0.383710 0.440889 -0.768082 -0.084304 -0.234007 ... 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0
4 1955 0.782435 1.008417 -0.871209 1138 0.340200 -0.243065 -0.053508 -0.374374 -0.366640 ... 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0

5 rows × 32 columns

In [73]:
df = df_ohc.copy()
In [74]:
list(df.columns)
Out[74]:
['age',
 'income',
 'frq',
 'rcn',
 'mnt',
 'clothes',
 'kitchen',
 'small_appliances',
 'toys',
 'house_keeping',
 'dependents',
 'per_net_purchase',
 'gender',
 'education',
 'status',
 'description',
 'birth_year',
 'spent_online',
 'education_2nd Cycle',
 'education_Graduation',
 'education_Master',
 'education_PhD',
 'status_Married',
 'status_Single',
 'status_Together',
 'status_Widow',
 'gender_M',
 'dependents_1.0',
 'description_Kind of OK',
 'description_Meh...',
 'description_OK nice!',
 'description_Take my money!!']
In [75]:
df_ohc[non_metric_features]
Out[75]:
education status gender dependents description
0 Graduation Together M False Take my money!!
1 PhD Divorced F False Take my money!!
2 Graduation Married M True Kind of OK
3 Master Married F True OK nice!
4 Graduation Together F True Take my money!!
... ... ... ... ... ...
8992 Graduation Married M True Kind of OK
8993 1st Cycle Single F True Take my money!!
8994 2nd Cycle Single M True Meh...
8995 Graduation Together M False Take my money!!
8996 2nd Cycle Married M True Take my money!!

8817 rows × 5 columns

In [ ]:
df_ohc["education"].unique() # 5 categories -> 4 dummies
Out[ ]:
array(['Graduation', 'PhD', 'Master', '2nd Cycle', '1st Cycle'],
      dtype=object)
In [77]:
ohc_df
Out[77]:
education_2nd Cycle education_Graduation education_Master education_PhD status_Married status_Single status_Together status_Widow gender_M dependents_1.0 description_Kind of OK description_Meh... description_OK nice! description_Take my money!!
0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0
1 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
2 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 1.0 1.0 0.0 0.0 0.0
3 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0
4 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
8992 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 1.0 1.0 0.0 0.0 0.0
8993 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
8994 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 0.0
8995 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0
8996 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0

8817 rows × 14 columns

Nem sempre o One-Hot Encoding é a melhor opção. Depende do conhecimento dos dados que temos. $\rightarrow$ Label Encoding seria a melhor opção aqui provavelmente.

In [78]:
# You can also choose to drop the original non_metric_features from your df
# df.drop(columns=non_metric_features)

Rename OHE columns¶

In [79]:
## Rename OHE columns from "feature_val_a" to "oh_feature_val_a"
## e.g. status_Widow to oh_status_Widow
## We do this to be able to distinguish the OHE columns more easily later

## Assemble OHE columns and their new column names
rename_ohe_cols = {}

for i in non_metric_features:
    for j in df.columns[df.columns.str.startswith(i)].to_list() :
        if j not in non_metric_features:
            rename_ohe_cols[j] = 'oh_' + j

df.rename(columns=rename_ohe_cols, inplace=True)
In [80]:
df
Out[80]:
age income frq rcn mnt clothes kitchen small_appliances toys house_keeping ... oh_status_Married oh_status_Single oh_status_Together oh_status_Widow oh_gender_M oh_dependents_1.0 oh_description_Kind of OK oh_description_Meh... oh_description_OK nice! oh_description_Take my money!!
0 1946 0.743162 1.191605 0.457819 1402 -0.617023 -0.243065 1.216847 0.495837 -0.499274 ... 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0
1 1936 1.559488 1.100011 -1.535723 1537 0.166160 -0.790228 0.740464 -0.374374 -0.631907 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
2 1990 -1.548542 -0.823463 0.557496 44 -0.834573 1.672006 -0.371096 -0.809480 2.286023 ... 1.0 0.0 0.0 0.0 1.0 1.0 1.0 0.0 0.0 0.0
3 1955 0.845528 0.550447 -1.402820 888 0.383710 0.440889 -0.768082 -0.084304 -0.234007 ... 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0
4 1955 0.782435 1.008417 -0.871209 1138 0.340200 -0.243065 -0.053508 -0.374374 -0.366640 ... 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
8992 1954 0.618993 0.458853 -1.701851 837 0.209670 0.167308 -0.132905 0.205767 -0.764540 ... 1.0 0.0 0.0 0.0 1.0 1.0 1.0 0.0 0.0 0.0
8993 1960 0.874744 0.733635 -1.701851 896 0.731791 -0.243065 -0.609288 -0.519409 -0.366640 ... 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
8994 1975 -0.455617 -0.731869 -1.535723 61 0.079139 -0.106274 -0.053508 0.060732 -0.101374 ... 0.0 1.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 0.0
8995 1986 -0.596155 0.825229 0.657173 1011 -0.442982 0.577680 0.581670 -0.809480 0.561792 ... 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0
8996 1938 2.034397 1.649574 0.756850 1668 0.427220 -0.653437 -0.291699 -0.229339 -0.101374 ... 1.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0

8817 rows × 32 columns

In [81]:
list(df.columns)
Out[81]:
['age',
 'income',
 'frq',
 'rcn',
 'mnt',
 'clothes',
 'kitchen',
 'small_appliances',
 'toys',
 'house_keeping',
 'dependents',
 'per_net_purchase',
 'gender',
 'education',
 'status',
 'description',
 'birth_year',
 'spent_online',
 'oh_education_2nd Cycle',
 'oh_education_Graduation',
 'oh_education_Master',
 'oh_education_PhD',
 'oh_status_Married',
 'oh_status_Single',
 'oh_status_Together',
 'oh_status_Widow',
 'oh_gender_M',
 'oh_dependents_1.0',
 'oh_description_Kind of OK',
 'oh_description_Meh...',
 'oh_description_OK nice!',
 'oh_description_Take my money!!']

Redo data exploration¶

Check if the data looks the way you expect it to.

  • Have you missed some outliers?
  • Are there still missing values?
  • Is the data normalized?

This is an iterative process. It is likely you will change your preprocessing steps frequently throughout your group work.

In [82]:
metric_features
Out[82]:
['income',
 'frq',
 'rcn',
 'clothes',
 'kitchen',
 'small_appliances',
 'toys',
 'house_keeping',
 'per_net_purchase',
 'spent_online']
In [83]:
non_metric_features
Out[83]:
['education', 'status', 'gender', 'dependents', 'description']
In [84]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 8817 entries, 0 to 8996
Data columns (total 32 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   age                             8817 non-null   int64  
 1   income                          8817 non-null   float64
 2   frq                             8817 non-null   float64
 3   rcn                             8817 non-null   float64
 4   mnt                             8817 non-null   int64  
 5   clothes                         8817 non-null   float64
 6   kitchen                         8817 non-null   float64
 7   small_appliances                8817 non-null   float64
 8   toys                            8817 non-null   float64
 9   house_keeping                   8817 non-null   float64
 10  dependents                      8817 non-null   boolean
 11  per_net_purchase                8817 non-null   float64
 12  gender                          8817 non-null   object 
 13  education                       8817 non-null   object 
 14  status                          8817 non-null   object 
 15  description                     8817 non-null   object 
 16  birth_year                      8817 non-null   int64  
 17  spent_online                    8817 non-null   float64
 18  oh_education_2nd Cycle          8817 non-null   float64
 19  oh_education_Graduation         8817 non-null   float64
 20  oh_education_Master             8817 non-null   float64
 21  oh_education_PhD                8817 non-null   float64
 22  oh_status_Married               8817 non-null   float64
 23  oh_status_Single                8817 non-null   float64
 24  oh_status_Together              8817 non-null   float64
 25  oh_status_Widow                 8817 non-null   float64
 26  oh_gender_M                     8817 non-null   float64
 27  oh_dependents_1.0               8817 non-null   float64
 28  oh_description_Kind of OK       8817 non-null   float64
 29  oh_description_Meh...           8817 non-null   float64
 30  oh_description_OK nice!         8817 non-null   float64
 31  oh_description_Take my money!!  8817 non-null   float64
dtypes: boolean(1), float64(24), int64(3), object(4)
memory usage: 2.2+ MB
In [85]:
# Save the cleaned data - Part 2
df.to_csv(os.path.join("..", "data", "cleaned_data_part2.csv"), index=False)

A tool to assist you through your exploratory data analysis¶

Optionally, you may use pandas-profiling in your data analysis.

Remember, although this tool provides excelent insights about the data you're working with, it is not enough to perform a proper analysis.

In [86]:
from ydata_profiling import ProfileReport

ProfileReport(
    df,
    title='Tugas Customer Data Preprocessed',
    correlations={
        "pearson": {"calculate": True},
        "spearman": {"calculate": False},
        "kendall": {"calculate": False},
        "phi_k": {"calculate": False},
        "cramers": {"calculate": False},
    },
)
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Out[86]: